In [270]:
import requests
from bs4 import BeautifulSoup
In [271]:
import os
# Directory the notebook runs in; relative paths used by other cells (such as
# the pickle output file) resolve against it. The default is the original
# author's location; set the LUTHER_DIR environment variable to run elsewhere
# without editing this cell. os.chdir still raises if the directory is missing.
PROJECT_DIR = os.environ.get('LUTHER_DIR', '/Users/ItelinaMa/Documents/Metis/Luther')
os.chdir(PROJECT_DIR)
In [272]:
# Box Office Mojo "daily by movie" listing, sorted by title ascending.
# The pages differ only in the `page` query parameter, so the URLs are built
# from one template instead of repeating the literal.
LISTING_URL = ('http://www.boxofficemojo.com/daily/'
               '?view=bymovie&yr={year}&page={page}&sort=title&order=ASC&p=.htm')
LISTING_YEAR = 2015
LISTING_PAGES = (1, 2)

urllist = [LISTING_URL.format(year=LISTING_YEAR, page=_page)
           for _page in LISTING_PAGES]

# Single URL (the second page) kept from the original cell; none of the
# cells shown here read it.
url = urllist[1]
In [273]:
def returnSoup(urllist, parser='html.parser', timeout=30):
    """Download each URL and parse the response body with BeautifulSoup.

    Parameters
    ----------
    urllist : iterable of str
        URLs to fetch, in order.
    parser : str, optional
        Parser name passed to BeautifulSoup. Naming it explicitly keeps the
        parse tree from depending on which optional parsers happen to be
        installed. NOTE(review): if the original results were produced with
        another parser (e.g. 'lxml'), pass that name here.
    timeout : float, optional
        Seconds handed to ``requests.get`` so a stalled connection raises
        instead of hanging the notebook.

    Returns
    -------
    list
        One BeautifulSoup object per URL, in the same order as ``urllist``.
    """
    soups = []
    for url in urllist:
        response = requests.get(url, timeout=timeout)
        # Echo the HTTP status so a failed fetch is visible in the cell output.
        # The parenthesised form prints the same text on Python 2 and 3.
        print(response.status_code)
        soups.append(BeautifulSoup(response.text, parser))
    return soups
In [274]:
# Fetch and parse every listing page in urllist (this performs the HTTP requests).
soups = returnSoup(urllist)
In [275]:
def makeData(soups):
    """Build a per-movie lookup from parsed listing pages.

    For each soup, the rows of the second ``<table>`` in the document are
    read, skipping the first two rows. From every remaining row the text of
    cell 0 (movie name), cell 1 (studio), cell 3 (release gross) and
    cell 4 (release date) is taken.

    Parameters
    ----------
    soups : iterable
        Parsed pages exposing ``find_all`` (BeautifulSoup objects).

    Returns
    -------
    dict
        ``{movie name: (studio, release gross, release date)}``, all values
        as the raw cell text. If a movie name occurs more than once, the
        last occurrence wins.

    Raises
    ------
    IndexError
        If a page contains fewer than two tables.
    """
    boxoffice = {}
    for soup in soups:
        # Find the rows once per page rather than re-searching the whole
        # document for every single cell.
        rows = soup.find_all('table')[1].find_all('tr')
        for row in rows[2:]:
            cells = row.find_all('td')
            if len(cells) < 5:
                # Row without the expected columns: skip it instead of
                # aborting the whole scrape with an IndexError.
                continue
            studio = cells[1].text
            releasegross = cells[3].text
            releasedate = cells[4].text
            boxoffice[cells[0].text] = (studio, releasegross, releasedate)
    return boxoffice
In [276]:
# Build the {movie name: (studio, release gross, release date)} lookup from the parsed pages.
boxoffice = makeData(soups)
In [280]:
# Sanity check: evaluates to True when the scrape produced exactly 156 distinct
# movie names (names are dict keys, so repeated names collapse into one entry).
# The expression only yields a boolean; nothing is enforced when it is False.
# NOTE(review): 156 is presumably the count observed when this was run - confirm.
len(boxoffice) == 156
Out[280]:
In [281]:
import pickle
# Persist the scraped dict to disk. pickle produces bytes, so the file has to
# be opened in binary mode ('wb'); text mode raises TypeError on Python 3.
with open('boxofficedata.pkl', 'wb') as picklefile:
    pickle.dump(boxoffice, picklefile)